This is the analysis by county. The main notebook of the whole analysis is located at Analysis.ipynb.
We will make use of the following libraries in this notebook:
import pandas as pd
import json
import plotly.express as px
import scipy.stats as sp
import numpy as np
from IPython.display import display, Markdown
We also import our own constants and functions.
from own_data import candidates, candidates_colors, poland_center, poland_zoom, map_margin, opacity
from utils import comma_to_dot, get_last_name
We read the CSV file with the results by county, given as percentages. The data is taken from the website of the National Electoral Commission. Poland uses a comma as the decimal separator, so we convert the values to dot-separated numbers so that they work better with the libraries.
# Load the results by county (percentages); the file is semicolon-separated.
results_counties_percent_df = pd.read_csv('data/results/results_by_county_percent.csv', sep=';')
# Keep only the identifying columns and one column per candidate. The explicit
# copy detaches the subset from the frame returned by read_csv, so the column
# assignments below operate on an independent frame and do not trigger
# pandas' SettingWithCopyWarning.
results_counties_percent_df = results_counties_percent_df[['Kod TERYT', 'Powiat'] + candidates].copy()
# Run every candidate column through the project's comma_to_dot helper
# (comma decimal separator -> dot).
for candidate in candidates:
    results_counties_percent_df[candidate] = results_counties_percent_df[candidate].map(comma_to_dot)
results_counties_percent_df.head()
Additionally, we import the geographical data about the borders of each county. The data is derived from the Head Office of Geodesy and Cartography. The website of GIS Support PL lets us download the package containing only the counties. To create the maps, we will use the GeoJSON format. The data from the websites mentioned above has the .shp extension, so I have converted it to GeoJSON using MapShaper.
# Read the GeoJSON file with the county borders; it is passed as `geojson`
# to every map below.
with open('data/geojson/counties.json', encoding='utf-8') as geojson_file:
    counties = json.loads(geojson_file.read())
# Show the properties of the first feature to see which keys are available.
counties['features'][0]['properties']
The TERYT code is a unique code of each administrative unit. In the election results the code has two extra trailing zeros. Additionally, it lacks the leading zero when the voivodeship number consists of only one digit. We are going to fix these issues so that the two data sets can be joined.
def fix_teryt_county(teryt):
    """Normalise a TERYT code from the election results for joining with the county map.

    The value is converted to a string; a five-character code gets a leading
    '0', and the last two characters are then dropped.

    Args:
        teryt: The code as found in the results data (any value accepted by ``str``).

    Returns:
        The normalised code as a string.
    """
    code = str(teryt)
    # Five characters means the leading zero is missing; restore it first.
    padded = code if len(code) != 5 else '0' + code
    # Strip the two extra trailing characters.
    return padded[:-2]
# Normalise the TERYT codes in place so they can be matched against the
# county codes stored in the GeoJSON features.
raw_teryt_codes = results_counties_percent_df['Kod TERYT'].astype(str)
results_counties_percent_df['Kod TERYT'] = raw_teryt_codes.map(fix_teryt_county)
results_counties_percent_df.head()
This is the location of the key that will join our data sets in counties JSON:
# Show the 'JPT_KOD_JE' property of the first feature; the maps below reference
# it through featureidkey="properties.JPT_KOD_JE".
counties['features'][0]['properties']['JPT_KOD_JE']
We finally plot the data on maps.
def get_figure_results_by_county(candidate):
    """Build a choropleth map of the given candidate's results by county.

    Args:
        candidate: Name of a column of ``results_counties_percent_df``; it is
            also used as the key into ``candidates_colors``.

    Returns:
        The figure created by ``px.choropleth_mapbox`` with its layout margin
        set to ``map_margin``.
    """
    columns = ['Kod TERYT', 'Powiat', candidate]
    # Rows for ships ('statki') and abroad ('zagranica') are dropped because
    # they will not be shown on the map.
    off_map = results_counties_percent_df.Powiat.isin(['statki', 'zagranica'])
    candidate_df = results_counties_percent_df.loc[~off_map, columns]
    fig = px.choropleth_mapbox(
        candidate_df,
        geojson=counties,
        featureidkey="properties.JPT_KOD_JE",
        locations='Kod TERYT',
        color=candidate,
        color_continuous_scale=candidates_colors[candidate],
        # Show the county name on hover, hide the raw TERYT code.
        hover_data={'Powiat': True, 'Kod TERYT': False},
        mapbox_style="carto-positron",
        center=poland_center,
        zoom=poland_zoom,
        opacity=opacity,
    )
    fig.update_layout(margin=map_margin)
    return fig
# Render a Markdown heading followed by the county map for each candidate.
for candidate in candidates:
    heading = Markdown(f'### Results of {candidate} by county')
    display(heading)
    candidate_fig = get_figure_results_by_county(candidate)
    candidate_fig.show()
# For every row take the candidate with the highest result and that result,
# and put them next to the county name and code.
candidate_results = results_counties_percent_df[candidates]
winner_names = candidate_results.idxmax(axis=1).rename('Winner').to_frame()
winner_results = candidate_results.max(axis=1).rename('Result').to_frame()
winner_labels = results_counties_percent_df[['Powiat', 'Kod TERYT']]
winners_counties_df = pd.concat([winner_names, winner_results, winner_labels], axis=1)
winners_counties_df.head(1)
# Map coloured by the winning candidate.
winners_counties_fig = px.choropleth_mapbox(
    winners_counties_df,
    geojson=counties,
    featureidkey="properties.JPT_KOD_JE",
    locations='Kod TERYT',
    color='Winner',
    color_discrete_sequence=px.colors.qualitative.D3,
    # Show county name and winning result on hover, hide the raw TERYT code.
    hover_data={'Powiat': True, 'Kod TERYT': False, 'Result': True},
    mapbox_style="carto-positron",
    center=poland_center,
    zoom=poland_zoom,
    opacity=opacity,
)
winners_counties_fig.update_layout(margin=map_margin)
winners_counties_fig.show()
# Result matrix: one row per row of the results frame, one column per candidate.
values = results_counties_percent_df[candidates].values
# Negating the values makes the partition descending; passing np.arange(4) as
# kth places the four largest results of each row, in order, in the first
# four positions.
top_four_columns = np.argpartition(-values, np.arange(4), axis=1)[:, :4]
row_numbers = np.arange(len(results_counties_percent_df))[:, None]
first_third_highest = values[row_numbers, top_four_columns]
# Column 1 holds the second-highest result of each row.
second_values = first_third_highest[:, 1]
second_values[:5]
second_values_df = pd.DataFrame(second_values, columns=['Result'])
second_values_df.head()
def get_col_name(row):
    """Return the column of ``results_counties_percent_df`` holding ``row['Result']``.

    The row of the results frame with the same index label as ``row`` is
    compared with ``row['Result']``; the label of the first matching column
    is returned.

    Args:
        row: A row (Series) with a ``'Result'`` entry whose ``name`` is an
            index label of ``results_counties_percent_df``.

    Returns:
        The name of the first column equal to the value. If no column
        matches, ``argmax`` yields position 0 and the first column name is
        returned.
    """
    source_row = results_counties_percent_df.loc[row.name]
    matches = source_row == row['Result']
    first_match = matches.argmax()
    return matches.index[first_match]
# Look up which candidate holds the second-highest result in each row and
# combine it with the result itself, the county name and the county code.
second_place_names = second_values_df.apply(get_col_name, axis=1).rename('Second place').to_frame()
second_place_labels = results_counties_percent_df[['Powiat', 'Kod TERYT']]
second_places_df = pd.concat([second_place_names, second_values_df, second_place_labels], axis=1)
second_places_df.head()
# NOTE(review): the colours are assigned in order of first appearance in the
# data; this hand-picked list presumably keeps each candidate's colour in line
# with the winners map — verify after any data change.
second_place_colors = ['#FF7F0E', '#1F77B4', 'rgb(102,102,102)', 'rgb(255,217,47)']
second_places_fig = px.choropleth_mapbox(
    second_places_df,
    geojson=counties,
    featureidkey="properties.JPT_KOD_JE",
    locations='Kod TERYT',
    color='Second place',
    color_discrete_sequence=second_place_colors,
    hover_data={'Powiat': True, 'Kod TERYT': False, 'Result': True},
    mapbox_style="carto-positron",
    center=poland_center,
    zoom=poland_zoom,
    opacity=opacity,
)
second_places_fig.update_layout(margin=map_margin)
second_places_fig.show()
# Column 2 of the ordered top results holds the third-highest result of each row.
third_values = first_third_highest[:, 2]
third_values_df = pd.DataFrame(third_values, columns=['Result'])
# Look up the candidate behind each third-highest result and add the county
# name and code.
third_place_names = third_values_df.apply(get_col_name, axis=1).rename('Third place').to_frame()
third_place_labels = results_counties_percent_df[['Powiat', 'Kod TERYT']]
third_places_df = pd.concat([third_place_names, third_values_df, third_place_labels], axis=1)
third_places_df.head()
# NOTE(review): the colours are assigned in order of first appearance in the
# data; this hand-picked list presumably keeps each candidate's colour in line
# with the other maps — verify after any data change.
third_place_colors = ['rgb(255,217,47)', 'rgb(102,102,102)', '#FF7F0E']
third_places_fig = px.choropleth_mapbox(
    third_places_df,
    geojson=counties,
    featureidkey="properties.JPT_KOD_JE",
    locations='Kod TERYT',
    color='Third place',
    color_discrete_sequence=third_place_colors,
    hover_data={'Powiat': True, 'Kod TERYT': False, 'Result': True},
    mapbox_style="carto-positron",
    center=poland_center,
    zoom=poland_zoom,
    opacity=opacity,
)
third_places_fig.update_layout(margin=map_margin)
third_places_fig.show()
Analyzing these maps, one can see that the voters of some candidates are spread fairly evenly across the whole country, while other candidates have much greater support in some regions. Which candidate has the most evenly spread electorate?
# Coefficient of variation of each candidate's results across all rows,
# sorted ascending and laid out as a single-row table for display.
variation_per_candidate = results_counties_percent_df[candidates].apply(sp.variation)
coefficient_of_variation_df = pd.DataFrame(variation_per_candidate).sort_values(by=0).T
coefficient_of_variation_df
# Back to one row per candidate for plotting, labelled via get_last_name.
coefficient_of_variation_df = coefficient_of_variation_df.T.reset_index()
coefficient_of_variation_df.columns = ['Candidate', 'Coefficient of variation']
coefficient_of_variation_df['Candidate'] = coefficient_of_variation_df['Candidate'].apply(get_last_name)
# Reversed RdYlGn scale: the low end of the value range gets the green end.
reversed_scale = list(reversed(px.colors.diverging.RdYlGn))
coefficient_of_variation_fig = px.bar(
    coefficient_of_variation_df,
    x='Candidate',
    y='Coefficient of variation',
    color='Coefficient of variation',
    color_continuous_scale=reversed_scale,
    title='Coefficient of variation of voters by county',
)
coefficient_of_variation_fig.show()
As we can see, Krzysztof Bosak is the most evenly supported candidate in Poland. He is followed by Stanisław Żółtek and Andrzej Duda. Rafał Trzaskowski is 8th in this comparison. Marek Jakubiak is at the end of the list.
The crucial challenge Andrzej Duda and Rafał Trzaskowski will need to face in the second round is to convince the voters who did not vote for them in the first round. Which counties have the most voters to convince? In other words, which counties should the two candidates focus on the most in the campaign?
We first find the number of voters of the other candidates in each county.
# Load the second results file (semicolon-separated), used here for the
# summed votes of the candidates outside the second round.
results_counties_df = pd.read_csv('data/results/results_by_county.csv', sep=';')
# NOTE(review): these names must match the entries of `candidates`
# character for character, otherwise a candidate is not excluded below —
# confirm the encoding of the second literal.
candidates_2nd_round = ['Andrzej Sebastian DUDA', 'RafaĆ Kazimierz TRZASKOWSKI']
candidates_no_2nd_round = [name for name in candidates if name not in candidates_2nd_round]
# Row-wise sum over all candidates who are not in the second round.
other_electorate = results_counties_df[candidates_no_2nd_round].sum(axis=1)
candidates_no_2nd_round_df = pd.DataFrame(other_electorate)
candidates_no_2nd_round_df.columns = ['Other electorate']
county_labels_df = results_counties_df[['Powiat', 'Kod TERYT']]
results_potential_2nd_round_df = pd.concat([county_labels_df, candidates_no_2nd_round_df], axis=1)
# Normalise the TERYT codes so they can be matched against the map features.
potential_teryt_codes = results_potential_2nd_round_df['Kod TERYT'].astype(str)
results_potential_2nd_round_df['Kod TERYT'] = potential_teryt_codes.map(fix_teryt_county)
results_potential_2nd_round_df.head()
We plot it.
# Rows for ships ('statki') and abroad ('zagranica') are dropped because they
# will not be shown on the map.
shown_on_map = ~results_potential_2nd_round_df.Powiat.isin(['statki', 'zagranica'])
results_potential_2nd_round_df = results_potential_2nd_round_df[shown_on_map]
# NOTE(review): center, zoom, opacity and margin are literal values here,
# while the other maps use poland_center, poland_zoom, opacity and map_margin
# from own_data — confirm whether they are meant to be the same.
results_potential_2nd_round_fig = px.choropleth_mapbox(
    results_potential_2nd_round_df,
    geojson=counties,
    featureidkey="properties.JPT_KOD_JE",
    locations='Kod TERYT',
    color='Other electorate',
    color_continuous_scale=px.colors.sequential.Reds,
    hover_data={'Powiat': True, 'Kod TERYT': False},
    mapbox_style="carto-positron",
    center={"lat": 52, "lon": 19.1451},
    zoom=5.2,
    opacity=0.8,
)
results_potential_2nd_round_fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
results_potential_2nd_round_fig.show()
These voters are concentrated mainly in big cities. It might be more informative to look at their share of the electorate in each county.
# Row-wise sum of the percentage results of all candidates who are not in
# the second round.
other_electorate_percent = results_counties_percent_df[candidates_no_2nd_round].sum(axis=1)
candidates_no_2nd_round_percent_df = pd.DataFrame(other_electorate_percent)
candidates_no_2nd_round_percent_df.columns = ['Other electorate [%]']
# Take the county name and code from the same frame the percentages come
# from. pd.concat(axis=1) aligns rows on the index only, so pairing these
# percentages with labels from the other results file would silently depend
# on both files listing the counties in the same order. 'Kod TERYT' in
# results_counties_percent_df has already been normalised with
# fix_teryt_county, so it must not be normalised a second time here.
results_potential_2nd_round_percent_df = pd.concat(
    [results_counties_percent_df[['Powiat', 'Kod TERYT']], candidates_no_2nd_round_percent_df], axis=1
)
# Rows for ships ('statki') and abroad ('zagranica') are dropped because they
# will not be shown on the map.
shown_on_percent_map = ~results_potential_2nd_round_percent_df.Powiat.isin(['statki', 'zagranica'])
results_potential_2nd_round_percent_df = results_potential_2nd_round_percent_df[shown_on_percent_map]
results_potential_2nd_round_percent_fig = px.choropleth_mapbox(
    results_potential_2nd_round_percent_df,
    geojson=counties,
    featureidkey="properties.JPT_KOD_JE",
    locations='Kod TERYT',
    color='Other electorate [%]',
    color_continuous_scale=px.colors.sequential.Reds,
    # Show the county name on hover, hide the raw TERYT code.
    hover_data={'Powiat': True, 'Kod TERYT': False},
    mapbox_style="carto-positron",
    center=poland_center,
    zoom=poland_zoom,
    opacity=0.8,
)
results_potential_2nd_round_percent_fig.update_layout(margin=map_margin)
results_potential_2nd_round_percent_fig.show()
The resulting map is somewhat similar to the map of Rafał Trzaskowski's results. It therefore seems more likely that he will gain more new voters in the second round.